{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from __future__ import (absolute_import, division,\n",
    "                        print_function, unicode_literals)\n",
    "from builtins import *\n",
    "from io import open\n",
    "\n",
    "import os\n",
    "import time, datetime\n",
    "\n",
    "from NoisyNLP.utils import *\n",
    "from NoisyNLP.features import *\n",
    "from NoisyNLP.models import *\n",
    "from NoisyNLP.experiments import *\n",
    "\n",
    "import pickle"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Input files for each data split.\n",
    "train_files = [\"./data/cleaned/train.BIEOU.tsv\"]\n",
    "dev_files = [\n",
    "    \"./data/cleaned/dev.BIEOU.tsv\",\n",
    "    \"./data/cleaned/dev_2015.BIEOU.tsv\",\n",
    "]\n",
    "test_files = [\"./data/cleaned/test.BIEOU.tsv\"]\n",
    "\n",
    "# Vocabulary file and output directory handed to Experiment.\n",
    "vocab_file = \"./vocab.no_extras.txt\"\n",
    "outdir = \"./test_exp\"\n",
    "\n",
    "# Directories handed to ClusterFeatures for the Brown and Clark clusterings.\n",
    "test_enriched_data_brown_cluster_dir = \"test_enriched/brown_clusters/\"\n",
    "test_enriched_data_clark_cluster_dir = \"test_enriched/clark_clusters/\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Total sequences:  7670\n"
     ]
    }
   ],
   "source": [
    "# Build the experiment from the configured files; its train/dev/test\n",
    "# sequences are read back from it below.\n",
    "exp = Experiment(outdir, train_files, dev_files, test_files, vocab_file)\n",
    "\n",
    "# Pool all three splits, keeping for every token entry only its first field\n",
    "# after preprocess_token(..., to_lower=True).\n",
    "all_sequences = []\n",
    "for sequence in exp.train_sequences + exp.dev_sequences + exp.test_sequences:\n",
    "    all_sequences.append(\n",
    "        [preprocess_token(token[0], to_lower=True) for token in sequence])\n",
    "print(\"Total sequences: \", len(all_sequences))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Brown clustering setup.\n",
    "# The clustering executable lives outside the repository, so its location can\n",
    "# be overridden with the BROWN_EXEC_PATH environment variable; when the\n",
    "# variable is unset the original hard-coded path is used.\n",
    "brown_exec_path = os.environ.get(\n",
    "    \"BROWN_EXEC_PATH\", \"/home/entity/Downloads/brown-cluster/wcluster\")\n",
    "brown_input_data_path = \"test_enriched/all_sequences.brown.txt\"\n",
    "test_enriched_data_brown_cf = ClusterFeatures(test_enriched_data_brown_cluster_dir,\n",
    "                                              cluster_type=\"brown\", n_clusters=100)\n",
    "test_enriched_data_brown_cf.set_exec_path(brown_exec_path)\n",
    "# Presumably writes all_sequences to brown_input_data_path as the clusterer's\n",
    "# input file -- TODO confirm against ClusterFeatures.gen_training_data.\n",
    "test_enriched_data_brown_cf.gen_training_data(all_sequences, brown_input_data_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Run Brown clustering on the prepared input file. Presumably the results are\n",
    "# written under the Brown cluster directory -- TODO confirm in ClusterFeatures.\n",
    "test_enriched_data_brown_cf.gen_clusters(\n",
    "    brown_input_data_path,\n",
    "    test_enriched_data_brown_cluster_dir)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Clark Clusters"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Clark clustering setup.\n",
    "# The clustering executable lives outside the repository, so its location can\n",
    "# be overridden with the CLARK_EXEC_PATH environment variable; when the\n",
    "# variable is unset the original hard-coded path is used.\n",
    "clark_exec_path = os.environ.get(\n",
    "    \"CLARK_EXEC_PATH\",\n",
    "    \"/home/entity/Downloads/clark_pos_induction/src/bin/cluster_neyessenmorph\")\n",
    "clark_input_data_path = \"test_enriched/all_sequences.clark.txt\"\n",
    "test_enriched_data_clark_cf = ClusterFeatures(test_enriched_data_clark_cluster_dir,\n",
    "                                              cluster_type=\"clark\", n_clusters=32)\n",
    "test_enriched_data_clark_cf.set_exec_path(clark_exec_path)\n",
    "# Presumably writes all_sequences to clark_input_data_path as the clusterer's\n",
    "# input file -- TODO confirm against ClusterFeatures.gen_training_data.\n",
    "test_enriched_data_clark_cf.gen_training_data(all_sequences, clark_input_data_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Run Clark clustering on the prepared input file. Presumably the results are\n",
    "# written under the Clark cluster directory -- TODO confirm in ClusterFeatures.\n",
    "test_enriched_data_clark_cf.gen_clusters(\n",
    "    clark_input_data_path,\n",
    "    test_enriched_data_clark_cluster_dir)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
